#devtools::install_github(repo="haozhu233/kableExtra", ref="a6af5c0") #you have to install this specific version of kableextra
#This says rvest might have broke it https://github.com/haozhu233/kableExtra/issues/595
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.3 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(tidyverse)
restartspark <- function(){
#devtools::install_github("rstudio/sparklyr")
#devtools::install_github("rstudio/sparklyr")
#install.packages('sparklyr') #rolling back to the stable version
library(sparklyr)
#spark_available_versions()
#spark_installed_versions()
#spark_uninstall(version="3.0.1", hadoop_version="3.2")
#spark_uninstall(version="2.4.3", hadoop_version="2.7")
#oh interesting the default is spark 2.4.3 I wonder why that is
#Error: Java 11 is only supported for Spark 3.0.0+
#spark_install("3.0") #3.1.1 is currently the latest stable, but 3.0 is the latest available
#library(geospark)
#library(arrow)
mem="160G"
try({spark_disconnect(sc)})
conf <- spark_config()
#conf$`sparklyr.cores.local` <- 128
#https://datasystemslab.github.io/GeoSpark/api/sql/GeoSparkSQL-Parameter/
conf$spark.serializer <- "org.apache.spark.serializer.KryoSerializer"
#conf$spark.kryo.registrator <- "org.datasyslab.geospark.serde.GeoSparkKryoRegistrator"
#conf$spark.kryoserializer.buffer.max <- "2047MB" #Caused by: java.lang.IllegalArgumentException: spark.kryoserializer.buffer.max must be less than 2048 mb, got: + 10240 mb.
#https://github.com/DataSystemsLab/GeoSpark/issues/217
#conf$geospark.global.index <- "true"
#conf$geospark.global.indextype <- "quadtree"
#conf$geospark.join.gridtype <- "kdbtree"
#conf$spark.sql.shuffle.partitions <- 1999 #https://github.com/DataSystemsLab/GeoSpark/issues/361 #setting to just under 2k so compression doesn't kick in, don't need to lower the memory footprint
conf$spark.driver.maxResultSize <- "100G"
conf$spark.memory.fraction <- 0.9
conf$spark.storage.blockManagerSlaveTimeoutMs <-"6000000s" #Failed during initialize_connection: java.lang.IllegalArgumentException: requirement failed: spark.executor.heartbeatInterval should be less than or equal to spark.storage.blockManagerSlaveTimeoutMs
conf$spark.executor.heartbeatInterval <-"6000000s"# "10000000s"
conf$spark.network.timeout <- "6000001s"
conf$spark.local.dir <- "/mnt/8tb_b/spark_temp/"
conf$spark.worker.cleanup.enabled <- "true"
conf$"sparklyr.shell.driver-memory"= mem
conf$'spark.driver.maxResultSize' <- 0 #0 is ulimmited
conf$'spark.sql.legacy.parquet.datetimeRebaseModeInRead' <- 'LEGACY'
conf$'spark.sql.legacy.parquet.datetimeRebaseModeInWrite' <- 'LEGACY'
conf$'spark.sql.execution.arrow.maxRecordsPerBatch' <- "5000000" #https://github.com/arctern-io/arctern/issues/399
#Error: org.apache.spark.sql.AnalysisException: The pivot column variable_clean has more than 10000 distinct values, this could indicate an error. If this was intended, set spark.sql.pivotMaxValues to at least the number of distinct values of the pivot column.;
conf$'spark.sql.pivotMaxValues' <- "5000000"
sc <<- spark_connect(master = "local", config = conf#,
#version = "2.3.3" #for geospark
)
}
restartspark()
##
## Attaching package: 'sparklyr'
## The following object is masked from 'package:purrr':
##
## invoke
## The following object is masked from 'package:stats':
##
## filter
## Error in spark_disconnect(sc) : object 'sc' not found
yid_test <- spark_read_parquet(sc, path="/mnt/8tb_a/rwd_github_private/TrumpSupportVaccinationRates/results/performance/" ,memory=F)
performance_ablation <- yid_test %>% collect() %>% group_by(ablation) %>% summarize(rmse_y_hat_test=Metrics::rmse(y_hat_test_2percMSE, y_share14plus), rae_y_hat_test=Metrics::mae(y_hat_test_2percMSE, y_share14plus))
#performance_ablation %>% View()
performance_ablation %>% ggplot(aes(x=ablation %>% as.factor(), y=rmse_y_hat_test)) + geom_point() + coord_flip()
performance_ablation %>% ggplot(aes(x=ablation %>% as.factor(), y=rae_y_hat_test)) + geom_point() + coord_flip()
residuals <- yid_test %>% dplyr::select(ablation, fips, y_share14plus, y_hat_test_2percMSE) %>% collect() %>% mutate(residual=y_hat_test_2percMSE-y_share14plus) %>% mutate(fips_state = round(fips/1000 ))
residuals_state <- residuals %>% group_by(ablation, fips_state) %>% summarise(residual=mean(residual))
## `summarise()` has grouped output by 'ablation'. You can override using the `.groups` argument.
residuals_state_diff <- residuals_state %>%
left_join(residuals_state %>% mutate(ablation=ablation-1) %>% rename(residual_withoutinfo=residual) ) %>%
mutate(state_residual_change_from_adding_info= round(abs(residual_withoutinfo)-abs(residual),4) )
## Joining, by = c("ablation", "fips_state")
states_sf_tigris_continental <- readRDS( "/mnt/8tb_a/rwd_github_private/TrumpSupportVaccinationRates/data_out/states_sf_tigris_continental.Rds") %>%
janitor::clean_names() %>% mutate(fips_state=statefp %>% as.numeric() )
for(i in (unique(residuals_state_diff$ablation )-1 ) ){
p_test_folds <-
states_sf_tigris_continental %>%
left_join(residuals_state_diff %>% filter(ablation==i)) %>%
ggplot(aes(fill = state_residual_change_from_adding_info )) +
geom_sf() +
#ggtitle("1") +
scale_fill_gradient2("Changes\nMAE",midpoint=0, trans="reverse") + xlab("") + ylab("") +
theme_bw() +
theme(legend.position = c(0.90, 0.25)) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
p_test_folds
ggsave(filename=glue::glue("/mnt/8tb_a/rwd_github_private/TrumpSupportVaccinationRates/docs/plots/p_ablation_states_{i}.png"), plot = p_test_folds , height=6, width=12)
}
## Joining, by = "fips_state"
## Joining, by = "fips_state"
## Joining, by = "fips_state"
## Joining, by = "fips_state"
## Joining, by = "fips_state"
## Joining, by = "fips_state"
## Joining, by = "fips_state"
## Joining, by = "fips_state"
## Joining, by = "fips_state"
i=residuals_state_diff$ablation %>% max()
p_test_folds <-
states_sf_tigris_continental %>%
left_join(residuals_state_diff %>% filter(ablation==i)) %>%
mutate(state_residual_change_from_adding_info=NA) %>%
ggplot(aes(fill = state_residual_change_from_adding_info )) +
geom_sf() +
#ggtitle("1") +
scale_fill_gradient2("Changes\nMAE",midpoint=0, trans="reverse") + xlab("") + ylab("") +
theme_bw() +
theme(legend.position = c(0.90, 0.25)) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
## Joining, by = "fips_state"
p_test_folds
ggsave(filename=glue::glue("/mnt/8tb_a/rwd_github_private/TrumpSupportVaccinationRates/docs/plots/p_ablation_states_{i}.png"), plot = p_test_folds , height=6, width=12)
treeshap_all <- spark_read_parquet(sc, path="/mnt/8tb_a/rwd_github_private/TrumpSupportVaccinationRates/results/shap/" ,memory=F)
dim(treeshap_all)
## [1] NA 45
number_features <- treeshap_all %>%
dplyr::select(ablation, test_fold,variable) %>%
sdf_distinct() %>%
count(ablation, test_fold) %>%
collect() %>%
group_by(ablation) %>%
summarise(feature_count_min=min(n), feature_count_mean=mean(n), feature_count_max=max(n))
used_vars_df <- treeshap_all %>%
dplyr::group_by(ablation, k_smallest, variable) %>%
summarise(shap_variable_total= shap %>% abs() %>% sum() ) %>%
dplyr::group_by(ablation, k_smallest) %>%
mutate(shap_cluster_total=shap_variable_total %>% abs() %>% sum()) %>%
ungroup() %>%
#filter(shap_cluster_total==max(shap_cluster_total)) %>%
dplyr::arrange(ablation,shap_cluster_total %>% desc(),shap_variable_total %>% desc() ) %>%
collect()
## Warning: Missing values are always removed in SQL.
## Use `SUM(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
## Warning: Missing values are always removed in SQL.
## Use `SUM(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
rhs_codebook_total_clustered <- readRDS("/mnt/8tb_a/rwd_github_private/TrumpSupportVaccinationRates/data_out/rhs_codebook_total_clustered.Rds")
top_5_vars_by_abblation <- used_vars_df %>% group_by(ablation) %>% slice_head(n = 5) %>%
filter(shap_cluster_total==max(shap_cluster_total)) %>%
left_join(rhs_codebook_total_clustered %>% dplyr::select(variable=variable, description )) %>%
mutate(description_clean = description %>%
str_replace_all("_",' ') %>%
str_replace_all("donaldjtrump 2020",'Trump Vote AAAA') %>%
str_replace_all("donaldtrump 2016",'Trump Vote BBBB') %>%
str_replace_all("[0-9]{4}",'') %>%
str_replace_all("AAAA",'2020') %>%
str_replace_all("BBBB",'2016') %>%
str_replace_all("perc$|perc |percent of |per capita |percent ",'') %>%
str_replace_all("percap",'') %>%
str_replace_all("change in people ",'Δ ') %>%
str_replace_all("\\(.*?\\)",' ') %>%
str_replace_all("--total number of adherents",' ') %>%
str_replace_all(" years and over",'<') %>%
str_replace_all(" under ",'≥') %>%
str_replace_all(" ",' ') %>%
str_replace_all(" ,",',') %>%
trimws()
)
## Joining, by = "variable"
#top_5_vars_by_abblation %>% View()
#install.packages('formattable')
library(formattable)
top_5_vars_by_abblation %>%
mutate(shap_variable_total = color_bar("lightgreen")(shap_variable_total %>% round()) ) %>%
dplyr::select(ablation,k_smallest,description_clean,shap=shap_variable_total) %>%
mutate(ablation=ablation %>% as.character()) %>%
mutate(k_smallest=k_smallest %>% as.character()) %>%
kbl(format='html',booktabs = TRUE, longtable = TRUE, escape = F, align = 'c' ) %>% #
column_spec(3, width = "10cm" ) %>%
column_spec(4, width = "1cm" ) %>%
collapse_rows(columns = 1:2, valign = "top")
| ablation | k_smallest | description_clean | shap |
|---|---|---|---|
| 1 | 96 | politics Trump Vote 2020 | 584 |
| politics Trump Vote 2016 | 246 | ||
| politics johnmccain | 37 | ||
| politics mittromney | 7 | ||
| politics georgewbush | 3 | ||
| 2 | 94 | people who self report masking always | 104 |
| dentists | 65 | ||
| people who self report masking rarely | 58 | ||
| flu vaccinations medicare | 55 | ||
| primary care physicians | 54 | ||
| 3 | 53 | Health insurance coverage status by age, total, 19 to 64 years, no health insurance coverage | 363 |
| Health insurance coverage status by age, total, 19 to 64 years, no health insurance coverage | 35 | ||
| Health insurance coverage status by age, total,≥19 years, no health insurance coverage | 13 | ||
| 4 | 77 | religion southern baptist convention | 91 |
| religion catholic–total number of congregations | 61 | ||
| religion mainline protestant | 55 | ||
| religion mainline protestant | 44 | ||
| religion evangelical lutheran church in america | 40 | ||
| 5 | 70 | Total fields of bachelor’s degrees reported, total, science and engineeringsocial sciences | 258 |
| Total fields of bachelor’s degrees reported, total, arts, humanities, and otherliterature and languages | 74 | ||
| Field of bachelor’s degree for first major for the population 25 | 11 | ||
| Field of bachelor’s degree for first major the population 25 | 11 | ||
| Total fields of bachelor’s degrees reported, total | 6 | ||
| 6 | 78 | Δ employed by Private in NAICS 72 Accommodation and food services | 145 |
| people employed by Federal Government in NAICS 48-49 Transportation and warehousing | 79 | ||
| people employed by Local Government in NAICS 92 Public administration | 41 | ||
| people employed by Private in NAICS 44-45 Retail trade | 10 | ||
| Δ employed by Private in NAICS 71 Arts, entertainment, and recreation | 4 | ||
| 7 | 42 | Mortgage status by real estate taxes paid, total, not mortgaged, less than $800 | 97 |
| Mortgage status by aggregate real estate taxes paid, aggregate real estate taxes paid | 84 | ||
| Mortgage status by real estate taxes paid, total, not mortgaged, $3,000 or more | 20 | ||
| Mortgage status by median real estate taxes paid, median real estate taxes paid –total, median real estate taxes paid for units without a mortgage | 18 | ||
| Mortgage status by real estate taxes paid, total, not mortgaged, no real estate taxes paid | 17 | ||
| 8 | 6 | Place of birth by educational attainment in the united states, total, born in other state in the united states, less than high school graduate | 51 |
| Place of birth by nativity and citizenship status, total, native, born in other state in the united states, northeast | 48 | ||
| Place of birth in the united states, total, born in state of residence | 32 | ||
| Place of birth by educational attainment in the united states, total, born in other state in the united states, high school graduate | 31 | ||
| Place of birth by nativity and citizenship status, total, native, born in other state in the united states, south | 28 | ||
| 9 | 8 | Geographical mobility in the past year by educational attainment for current residence in the united states, total, graduate or professional degree | 101 |
| mobility workplaces change from baseline | 53 | ||
| Geographical mobility in the past year by educational attainment for residence 1 year ago in the united states, total living in area 1 year ago, same house, graduate or professional degree | 21 | ||
| Geographical mobility in the past year by educational attainment for residence 1 year ago in the united states, total living in area 1 year ago, graduate or professional degree | 19 | ||
| mobility population flows external ratio to internal | 15 |
Feature shapes
shap_by_covariate <- treeshap_all %>%
dplyr::select(ablation, variable, covariate_value, shap) %>%
right_join(top_5_vars_by_abblation, copy=T) %>%
#mutate(covariate_value_scaled_rounded=round(covariate_value_scaled, 1)) %>%
#group_by(ablation, variable, covariate_value_scaled_rounded) %>%
#summarise(shape_mean=mean(shap)) %>%
collect()
## Joining, by = c("ablation", "variable")
dim(shap_by_covariate) #8,748,504
## [1] 438239 9
top_5_shap_by_covariate <-
shap_by_covariate %>%
arrange(ablation, shap_cluster_total %>% desc() ) %>%
mutate(shap_variable_total_rank = (-1*shap_variable_total) %>% as.factor() %>% as.numeric() ) %>%
mutate(ablation_rank= ablation*1000 + shap_variable_total_rank) %>%
mutate(ablation_variable_i = ablation_rank %>% as.factor() %>% as.numeric() ) %>%
arrange(ablation_variable_i)
#dplyr::select(ablation,variable,shap_variable_total_rank) %>% distinct()
for(i in unique(top_5_shap_by_covariate$ablation_variable_i)){
p <- top_5_shap_by_covariate %>%
dplyr::filter(ablation_variable_i==i) %>%
ggplot(aes(x=covariate_value %>% scale(),y=shap)) +
#geom_point() +
geom_quantile(method = "rqss", lambda = 0.1, quantiles = c(0.05, 0.95), na.rm=T, linetype=c('solid'), color="black", size=0.25) +
geom_quantile(method = "rqss", lambda = 0.1, quantiles = c(0.5), na.rm=T, linetype=c('solid'), color="black") +
theme_void() +
geom_hline(yintercept=0, linetype="dashed", color = "blue", size=0.25) +
geom_vline(xintercept=0, linetype="dashed", color = "blue", size=0.25)
ggsave(filename=glue::glue("/mnt/8tb_a/rwd_github_private/TrumpSupportVaccinationRates/docs/plots/p_ablation_marginal_fx_{i}.png"),
plot = p , height=0.5, width=1)
}
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Warning in rq.fit.sfn(x, y, tau = tau, rhs = rhs, control = control, ...): tiny diagonals replaced with Inf when calling blkfct
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Warning in rq.fit.sfn(x, y, tau = tau, rhs = rhs, control = control, ...): tiny diagonals replaced with Inf when calling blkfct
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Warning in rq.fit.sfn(x, y, tau = tau, rhs = rhs, control = control, ...): tiny diagonals replaced with Inf when calling blkfct
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Warning in rq.fit.sfn(x, y, tau = tau, rhs = rhs, control = control, ...): tiny diagonals replaced with Inf when calling blkfct
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Warning in rq.fit.sfn(x, y, tau = tau, rhs = rhs, control = control, ...): tiny diagonals replaced with Inf when calling blkfct
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Warning in rq.fit.sfn(x, y, tau = tau, rhs = rhs, control = control, ...): tiny diagonals replaced with Inf when calling blkfct
## Warning in rq.fit.sfn(x, y, tau = tau, rhs = rhs, control = control, ...): tiny diagonals replaced with Inf when calling blkfct
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Warning in rq.fit.sfn(x, y, tau = tau, rhs = rhs, control = control, ...): tiny diagonals replaced with Inf when calling blkfct
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
## Smoothing formula not specified. Using: y ~ qss(x, lambda = 0.1)
#top_5_shap_by_covariate <- shap_by_covariate %>%
# arrange(ablation, shap_cluster_total %>% desc(),
# shap_variable_total %>% desc(),covariate_value_scaled_rounded) %>%
# mutate(groups=paste0(ablation,variable))
#top_5_shap_by_covariate_list <- split(top_5_shap_by_covariate$shape_mean,top_5_shap_by_covariate$groups)
#depricated now
#, results="asis"
#of O remder to html it works but not other scenario
library(tidyverse)
library(kableExtra)
setwd("/mnt/8tb_a/rwd_github_private/TrumpSupportVaccinationRates/docs")
library(here)
#kable_styling(latex_options = c("hold_position", "repeat_header"), bootstrap_options = c('striped')) %>%
#column_spec(5,
# image = spec_image(
# #You have to knit to html and you have to use the file:/// prefix or it all breaks
# sapply(performance_ablation$ablation, FUN=function(i) glue::glue("file:///mnt/8tb_a/rwd_github_private/TrumpSupportVaccinationRates/docs/plots/p_ablation_states_{i}.png" )),
# 400, 200)
# ) %>%
#column_spec(3, width = "10cm" ) %>%
performance_ablation %>%
left_join(number_features %>% dplyr::select(ablation,feature_count_mean)) %>%
dplyr::select(Round=ablation , RMSE=rmse_y_hat_test , MAE=rae_y_hat_test, Features=feature_count_mean ) %>%
mutate(Features= round(Features,0)) %>%
mutate(RMSE= round(RMSE*100,2 )) %>%
mutate(MAE= round(MAE*100,2 )) %>%
mutate(MAE_Delta ="") %>%
#mutate(RMSE = color_bar("orange")(RMSE ) ) %>% #too close doesn't look good
#mutate(MAE = color_bar("blue")(MAE ) ) %>%
kbl(format='html',booktabs = TRUE, longtable = TRUE, escape = F) %>% #, align = 'c'
kable_paper(full_width = F) %>%
kable_styling(latex_options = c("hold_position", "repeat_header"), bootstrap_options = c('striped')) %>%
column_spec(5,
image = spec_image(
#You have to knit to html and you have to use the file:/// prefix or it all breaks
sapply(performance_ablation$ablation, FUN=function(i) glue::glue("file:///mnt/8tb_a/rwd_github_private/TrumpSupportVaccinationRates/docs/plots/p_ablation_states_{i}.png" )),
400, 200)
) %>%
column_spec(6, image = spec_plot(mpg_list, same_lim = TRUE)) %>%
Combine the two tables
temp<-
performance_ablation %>%
left_join(number_features %>% dplyr::select(ablation,feature_count_mean)) %>%
left_join(
top_5_vars_by_abblation %>%
dplyr::select(ablation,k_smallest,description_clean,shap=shap_variable_total)
) %>%
mutate(MAE_Delta ="") %>%
dplyr::select(ablation , rmse_y_hat_test , rae_y_hat_test, feature_count_mean, MAE_Delta, description_clean, shap) %>%
mutate(feature_count_mean= round(feature_count_mean,0)) %>%
mutate(rmse_y_hat_test= round(rmse_y_hat_test*100,2 )) %>%
mutate(rae_y_hat_test= round(rae_y_hat_test*100,2 )) %>%
mutate(shap= round(shap )) %>%
#Alternatively there's a markdown way to do this
#https://www.titanwolf.org/Network/q/a471f9e2-3921-4292-bd69-467a36114657/y
mutate(MAE_Delta= sprintf("",
sapply(top_5_vars_by_abblation$ablation, FUN=function(i) glue::glue("file:///mnt/8tb_a/rwd_github_private/TrumpSupportVaccinationRates/docs/plots/p_ablation_states_{i}.png" )) ) ) %>%
#mutate(RMSE = color_bar("orange")(RMSE ) ) %>% #too close doesn't look good
#mutate(MAE = color_bar("blue")(MAE ) ) %>%
mutate(MarginalFX=glue::glue("}.png)"))
## Joining, by = "ablation"
## Joining, by = "ablation"
dim(temp)
## [1] 43 8
#ggsave(filename=glue::glue("/mnt/8tb_a/rwd_github_private/TrumpSupportVaccinationRates/docs/plots/p_ablation_marginal_fx_{i}.png"), plot = p , height=0.5, width=1)
temp %>%
kbl(format='html', padding=0 ,
booktabs = TRUE,
longtable = TRUE,
escape = F,
align = c('l', 'c', 'c', 'c', 'c', 'l', 'c'),
col.names = c("", "RMSE", "MAE", "Feat.","Δ MAE by St.","Feature","Shap","Marginal Fx.")
) %>%
kable_paper("striped", full_width = F) %>%
kable_styling(bootstrap_options="condensed") %>%
row_spec(row=0, angle = 0, align='l', hline_after=T) %>%
add_header_above(c(" " = 1, "Model Performance (Cumulative Ablation of Top Performing Feature Clusters)" = 4, "Feature Performance (Top Cluster)" = 3)) %>%
column_spec(5, width = "4cm" ) %>%
column_spec(8, width = "1cm" ) %>%
#column_spec(8, image = spec_plot(top_5_shap_by_covariate_list, same_lim = F)) %>%
collapse_rows(columns = 1:5, valign = "top")
|
Model Performance (Cumulative Ablation of Top Performing Feature Clusters)
|
Feature Performance (Top Cluster)
|
||||||
|---|---|---|---|---|---|---|---|
| RMSE | MAE | Feat. | Δ MAE by St. | Feature | Shap | Marginal Fx. | |
| 1 | 7.87 | 5.90 | 70 |
|
politics Trump Vote 2020 | 584 |
|
| politics Trump Vote 2016 | 246 |
|
|||||
| politics johnmccain | 37 |
|
|||||
| politics mittromney | 7 |
|
|||||
| politics georgewbush | 3 |
|
|||||
| 2 | 8.49 | 6.43 | 58 |
|
people who self report masking always | 104 |
|
| dentists | 65 |
|
|||||
| people who self report masking rarely | 58 |
|
|||||
| flu vaccinations medicare | 55 |
|
|||||
| primary care physicians | 54 |
|
|||||
| 3 | 8.87 | 6.66 | 51 |
|
Health insurance coverage status by age, total, 19 to 64 years, no health insurance coverage | 363 |
|
| Health insurance coverage status by age, total, 19 to 64 years, no health insurance coverage | 35 |
|
|||||
| Health insurance coverage status by age, total,≥19 years, no health insurance coverage | 13 |
|
|||||
| 4 | 8.81 | 6.61 | 65 |
|
religion southern baptist convention | 91 |
|
| religion catholic–total number of congregations | 61 |
|
|||||
| religion mainline protestant | 55 |
|
|||||
| religion mainline protestant | 44 |
|
|||||
| religion evangelical lutheran church in america | 40 |
|
|||||
| 5 | 8.73 | 6.59 | 52 |
|
Total fields of bachelor’s degrees reported, total, science and engineeringsocial sciences | 258 |
|
| Total fields of bachelor’s degrees reported, total, arts, humanities, and otherliterature and languages | 74 |
|
|||||
| Field of bachelor’s degree for first major for the population 25 | 11 |
|
|||||
| Field of bachelor’s degree for first major the population 25 | 11 |
|
|||||
| Total fields of bachelor’s degrees reported, total | 6 |
|
|||||
| 6 | 8.92 | 6.75 | 60 |
|
Δ employed by Private in NAICS 72 Accommodation and food services | 145 |
|
| people employed by Federal Government in NAICS 48-49 Transportation and warehousing | 79 |
|
|||||
| people employed by Local Government in NAICS 92 Public administration | 41 |
|
|||||
| people employed by Private in NAICS 44-45 Retail trade | 10 |
|
|||||
| Δ employed by Private in NAICS 71 Arts, entertainment, and recreation | 4 |
|
|||||
| 7 | 8.95 | 6.83 | 72 |
|
Mortgage status by real estate taxes paid, total, not mortgaged, less than $800 | 97 |
|
| Mortgage status by aggregate real estate taxes paid, aggregate real estate taxes paid | 84 |
|
|||||
| Mortgage status by real estate taxes paid, total, not mortgaged, $3,000 or more | 20 |
|
|||||
| Mortgage status by median real estate taxes paid, median real estate taxes paid –total, median real estate taxes paid for units without a mortgage | 18 |
|
|||||
| Mortgage status by real estate taxes paid, total, not mortgaged, no real estate taxes paid | 17 |
|
|||||
| 8 | 8.93 | 6.80 | 65 |
|
Place of birth by educational attainment in the united states, total, born in other state in the united states, less than high school graduate | 51 |
|
| Place of birth by nativity and citizenship status, total, native, born in other state in the united states, northeast | 48 |
|
|||||
| Place of birth in the united states, total, born in state of residence | 32 |
|
|||||
| Place of birth by educational attainment in the united states, total, born in other state in the united states, high school graduate | 31 |
|
|||||
| Place of birth by nativity and citizenship status, total, native, born in other state in the united states, south | 28 |
|
|||||
| 9 | 8.85 | 6.77 | 75 |
|
Geographical mobility in the past year by educational attainment for current residence in the united states, total, graduate or professional degree | 101 |
|
| mobility workplaces change from baseline | 53 |
|
|||||
| Geographical mobility in the past year by educational attainment for residence 1 year ago in the united states, total living in area 1 year ago, same house, graduate or professional degree | 21 |
|
|||||
| Geographical mobility in the past year by educational attainment for residence 1 year ago in the united states, total living in area 1 year ago, graduate or professional degree | 19 |
|
|||||
| mobility population flows external ratio to internal | 15 |
|
|||||